import threading
import os
from collections import Counter
import pandas as pd

from dataProcess.DrawImg import DrawImg


def worker(self, pice_dict, threed_mark):
    """Resolve each author_url in *pice_dict* to its author profile row.

    Runs as the target of one worker thread. For every (author_url, count)
    pair it looks the URL up in self.df and collects a dict with the
    author's profile fields plus the post count, then stores the collected
    list in self.threed_worked_storage under *threed_mark*.

    Args:
        self: the ThreedAccelerateLaborStatiscal instance (provides .df and
            .threed_worked_storage).
        pice_dict: dict mapping author_url -> post count for this chunk.
        threed_mark: string thread id used as the storage key.
    """
    total = len(pice_dict)
    temp_author_info = []

    # Build the author_url -> first matching row index lookup once, instead
    # of rescanning the whole DataFrame for every word (was O(words * rows)).
    # setdefault keeps the FIRST occurrence, matching the original `break`.
    url_to_row = {}
    for i in range(len(self.df)):
        url_to_row.setdefault(self.df["author_url"][i], i)

    for position, (word, count) in enumerate(pice_dict.items(), start=1):
        print(f"线程：{threed_mark}正在分析，第{position}/{total}条数据")
        i = url_to_row.get(word)
        if i is not None:
            temp_author_info.append({
                "name": self.df["author"][i],
                "value": count,
                "author_url": self.df["author_url"][i],
                "author_lever": self.df["author_lever"][i],
                "author_articles": self.df["author_articles"][i],
                "author_watchs": self.df["author_watchs"][i],
                "author_fans": self.df["author_fans"][i],
            })

    # Store the finished chunk ONCE, after the loop. The original assigned
    # inside the loop, so an empty pice_dict never created the key and the
    # aggregation in run() would raise KeyError on the missing thread mark.
    self.threed_worked_storage[threed_mark] = temp_author_info


class ThreedAccelerateLaborStatiscal(object):
    """Count how many articles each author published (author_url is unique
    per author), split the counts into chunks, and resolve author details
    with one worker thread per chunk.
    """

    def __init__(self):
        # Placeholder list for split sub-DataFrames (currently unused).
        self.dfs = []
        # Project base directory: two levels above this file.
        self.my_BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        # csv_path: the crawler's output CSV to analyse.
        self.csv_path = os.path.join(self.my_BASE_DIR, 'static', 'csv_collect', 'deep_articles.csv')
        # Full DataFrame holding the crawled article/author rows.
        self.df = pd.read_csv(self.csv_path, sep=",", header=0, encoding="utf-8")
        # author_url is unique per author, so it is used to count posts.
        self.author_url_list = []
        self.author_url_dict = {}
        # Task list for the worker threads: each entry is one chunk dict.
        self.threed_works_list = []
        # Per-thread result storage, keyed by thread mark (stringified index).
        self.threed_worked_storage = {}
        # Final author info list returned to the API / frontend.
        self.author_info = []

    def loadData(self):
        """Build the sorted author_url -> post-count dict and split it into
        work chunks: one remainder-sized chunk first, then chunks of 10.

        Side effects: fills self.author_url_list, self.author_url_dict and
        self.threed_works_list (trimmed to chunks 1..20, see below).
        """
        # Clean the data: keep only real, non-empty string URLs.
        # NOTE: empty CSV cells come back from pandas as float NaN, and the
        # old `"" != value` comparison is True for NaN, so NaN slipped
        # through and polluted the Counter. The isinstance check filters
        # both NaN and any other non-string value.
        for i in range(len(self.df)):
            url = self.df["author_url"][i]
            if isinstance(url, str) and url != "" and url != "NOTFOUND":
                self.author_url_list.append(url)

        # Count occurrences of each author_url.
        self.author_url_dict = dict(Counter(self.author_url_list))
        # Sort by count, descending (Python's sort is stable, so equal
        # counts keep their insertion order).
        sorted_list = sorted(self.author_url_dict.items(), key=lambda x: x[1], reverse=True)
        self.author_url_dict = dict(sorted_list)

        # Split the dict: first a chunk holding the remainder (len % 10)
        # items, then chunks of exactly 10 items each. Because the
        # remainder is taken first, the tail divides evenly by 10 and no
        # items are ever dropped by the splitter itself.
        position = 1
        count_num = 1  # item counter inside the current 10-item chunk
        dict_length = len(self.author_url_dict)
        surplus_workload = dict_length % 10
        pice_dict = {}
        for word, count in self.author_url_dict.items():
            print(f"正在拆分工作量，第{position}/{dict_length}条数据")
            pice_dict[word] = count

            # Close the remainder chunk first (never true when surplus is 0,
            # since position starts at 1).
            if position == surplus_workload:
                self.threed_works_list.append(pice_dict)
                pice_dict = {}
            # Past the remainder: close a chunk every 10 items.
            if position > surplus_workload:
                if count_num != 10:
                    count_num = count_num + 1
                else:
                    self.threed_works_list.append(pice_dict)
                    pice_dict = {}
                    count_num = 1
            position = position + 1
        print("工作量拆分成功，请查验：————————————————————————————————————————————————")
        # The full workload was too large for the frontend, so keep only
        # chunks 1..20. This deliberately drops the remainder chunk at
        # index 0 and everything past index 20.
        self.threed_works_list = self.threed_works_list[1:21]
        print(self.threed_works_list)

    def run(self):
        """Spawn one worker thread per chunk, merge the per-thread results
        into self.author_info, draw the matplotlib ranking image and return
        the merged list.

        Returns:
            list[dict]: author info dicts (name/value/profile fields) in
            chunk order, for the API to hand to the frontend.
        """
        # Split the workload; the chunk count is the thread count.
        self.loadData()
        my_threed_workers_need = len(self.threed_works_list)

        threads = []
        # Start one thread per chunk; each worker writes its result into
        # self.threed_worked_storage under its own string mark.
        for i in range(my_threed_workers_need):
            threed_mark = str(i)
            pice_dict = self.threed_works_list[i]
            t = threading.Thread(target=worker, args=(self, pice_dict, threed_mark))
            t.start()
            threads.append(t)

        # Wait for every worker to finish before aggregating.
        for t in threads:
            t.join()
        print("所有的子线程都已经完成")

        # Show the raw per-thread results for inspection.
        print(self.threed_worked_storage)

        # Merge the per-thread results back into one ordered list, in
        # thread-mark order so the output matches the chunk order.
        for i in range(len(self.threed_worked_storage)):
            threed_mark = str(i)
            temp_author_info = self.threed_worked_storage[threed_mark]
            for item in temp_author_info:
                self.author_info.append(item)

        # Draw the ranking chart (matplotlib, per course requirement) for
        # the leading entries.
        di = DrawImg()
        xAxis = []
        yAxis = []
        for item in self.author_info[0:11]:
            xAxis.append(item["name"])
            yAxis.append(item["value"])
        di.draw(xAxis, yAxis, 'laborRank_matplotlib_img')

        return self.author_info


# Allow running this module directly as a script.
if __name__ == "__main__":
    try:
        statistical = ThreedAccelerateLaborStatiscal()
        statistical.run()
    except Exception as e:
        # Top-level boundary: report any failure instead of a raw traceback.
        print("错误:", e)
